import warnings

import pandas as pd
from pandas import json_normalize
import requests

import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import cufflinks as cf
import plotly.graph_objs as go
from plotly.offline import iplot

# nltk.download('stopwords')  # run once if the stopwords corpus has not been downloaded yet
warnings.filterwarnings("ignore")
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
%matplotlib inline
apikey = open("creds_guardian.txt").read().strip()
def search_guardian_articles(api_key, search_term='', page=1, page_size=200):
    '''Retrieves metadata of articles matching the search term from the Guardian Content API.'''
    url = 'https://content.guardianapis.com/search'
    # requests handles URL-encoding of the query string, so spaces need no manual escaping
    params = {'api-key': api_key,
              'format': 'json',
              'page': page,
              'page-size': page_size,
              'q': search_term}
    response = requests.get(url, params=params)
    return response.json()
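Before downloading in bulk, it helps to confirm the shape of the JSON the endpoint returns. A quick sanity check is sketched below; the envelope keys used here ('response', 'status', 'total', 'results') follow the Guardian Content API's standard search response, but verify them against your own output.
sample = search_guardian_articles(api_key=apikey, search_term='Ukraine Russia War', page_size=10)
print(sample['response']['status'])        # expected 'ok' when the key and query are accepted
print(sample['response']['total'])         # total number of matching articles on the server
print(len(sample['response']['results']))  # records returned on this page (here at most 10)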
def guardian_articles_dataframe(api_key, search_term='', number_of_records=1000):
    '''Returns a dataframe with article information from the Guardian API.

    Args:
        search_term: Query string passed to the Guardian API to search the server database.
        api_key: Key required to access the Guardian API; available for free from the Guardian Open Platform website.
        number_of_records: Number of records to return in the dataframe.'''
    frames = []
    # Iterate through a series of API calls (200 records per page) and collect the results
    for page in range(1, int(number_of_records / 200) + 1):
        try:
            results = search_guardian_articles(api_key=api_key, search_term=search_term,
                                               page_size=200, page=page)['response']['results']
            frames.append(json_normalize(results))
        except (KeyError, requests.RequestException):
            # Stop paging once the API stops returning results or a request fails
            break
    # Concatenate the pages, reset the index and trim to the requested number of records
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return df[:number_of_records]
# Search the Guardian API with a term to get our dataset
term = 'Ukraine Russia War'
df = guardian_articles_dataframe(api_key=apikey, search_term=term, number_of_records=1000)
df.head()
 | id | type | sectionId | sectionName | webPublicationDate | webTitle | webUrl | apiUrl | isHosted | pillarId | pillarName |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | world/2022/dec/02/finland-pm-sanna-marin-says-... | article | world | World news | 2022-12-02T06:23:23Z | Finland PM Sanna Marin says Europe is ‘not str... | https://www.theguardian.com/world/2022/dec/02/... | https://content.guardianapis.com/world/2022/de... | False | pillar/news | News |
1 | world/live/2023/feb/05/russia-ukraine-war-situ... | liveblog | world | World news | 2023-02-05T18:07:46Z | Russia-Ukraine war live: Ukraine ‘expects poss... | https://www.theguardian.com/world/live/2023/fe... | https://content.guardianapis.com/world/live/20... | False | pillar/news | News |
2 | world/live/2023/jan/28/russia-ukraine-war-zele... | liveblog | world | World news | 2023-01-28T17:56:20Z | Russia-Ukraine war live: Ukraine struggling to... | https://www.theguardian.com/world/live/2023/ja... | https://content.guardianapis.com/world/live/20... | False | pillar/news | News |
3 | world/live/2022/dec/26/russia-ukraine-war-live... | liveblog | world | World news | 2022-12-26T19:52:39Z | Russia-Ukraine war live: Ukraine aiming for pe... | https://www.theguardian.com/world/live/2022/de... | https://content.guardianapis.com/world/live/20... | False | pillar/news | News |
4 | world/live/2023/jan/11/russia-ukraine-war-live... | liveblog | world | World news | 2023-01-11T19:01:42Z | Russia-Ukraine war: Putin replaces general in ... | https://www.theguardian.com/world/live/2023/ja... | https://content.guardianapis.com/world/live/20... | False | pillar/news | News |
# Keep only the columns needed for the analysis
df = df.filter(items=['sectionName','webTitle','webUrl','pillarName'])
# Extract all rows where pillarName == "News"
news = df['pillarName'] == "News"
df = df[news]
# Remove English stopwords from the headlines
stop_words = stopwords.words('english')
df['clean_title'] = df['webTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
# Extend the stopword list with a few custom tokens seen in the headlines
stop_words = stopwords.words('english') + ['At', 'v', '3']
# Apply the same stopword removal with the extended list, assigned to a new column so the two versions can be compared
df['clean_title_v2'] = df['webTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
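Note that this token-level filter is case-sensitive and leaves punctuation attached to words, so tokens such as 'The' or 'war:' never match the lowercase stopword list. A minimal sketch of a normalised variant is shown below for comparison; the clean_title_v3 column is purely illustrative and is not used in the rest of the analysis.
import re

def remove_stopwords_normalised(title, stop_list=stop_words):
    # Lowercase and strip punctuation before the stopword comparison,
    # so 'The' and 'war:' are matched as 'the' and 'war'.
    tokens = re.findall(r"[a-z0-9']+", title.lower())
    return ' '.join(t for t in tokens if t not in stop_list)

# Illustrative only (hypothetical column name); the analysis below keeps using clean_title_v2.
df['clean_title_v3'] = df['webTitle'].apply(remove_stopwords_normalised)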
# Using regex to strip leftover HTML tags and entities from the titles
def preprocess(webTitle):
    webTitle = webTitle.str.replace(r'<br/>', '', regex=True)
    webTitle = webTitle.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    webTitle = webTitle.str.replace(r'&amp;?', '', regex=True)
    webTitle = webTitle.str.replace(r'&gt;?', '', regex=True)
    webTitle = webTitle.str.replace(r'&lt;?', '', regex=True)
    webTitle = webTitle.str.replace('\xa0', ' ', regex=False)
    return webTitle

df['clean_title_v2'] = preprocess(df['clean_title_v2'])
# Sentiment polarity of each headline, plus simple length features
df['polarity'] = df['webTitle'].map(lambda text: TextBlob(text).sentiment.polarity)
df['webTitle_len'] = df['webTitle'].astype(str).apply(len)
df['word_count'] = df['webTitle'].apply(lambda x: len(str(x).split()))
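TextBlob's polarity is a lexicon-based score in [-1.0, 1.0]: negative values indicate negative sentiment, 0 is neutral, and positive values indicate positive sentiment. Headlines containing no opinion-bearing words therefore tend to score exactly 0, which explains the large neutral group below. A quick illustration on toy sentences (the exact numbers depend on TextBlob's lexicon):
# Only words present in TextBlob's sentiment lexicon contribute to the score.
for sentence in ['This is a great outcome', 'Troops moved east overnight', 'This is a terrible attack']:
    print(sentence, '->', TextBlob(sentence).sentiment.polarity)
# Expected signs: positive, roughly zero, negative (exact values depend on the lexicon).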
print('5 random headlines with positive sentiment polarity: \n')
cl = df.loc[df.polarity > 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with positive sentiment polarity:

Kherson fell quickly, Ukraine’s progress east Dnipro harder
Russia-Ukraine war latest: know day 177 invasion
Russia-Ukraine war latest: know day 185 invasion
‘Ukraine definitely win’ says president visit Mykolaiv
Russia-Ukraine war latest: know day 202 invasion
print('5 random headlines with neutral (zero) sentiment polarity: \n')
cl = df.loc[df.polarity == 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with neutral (zero) sentiment polarity:

Russia-Ukraine war: Russia accused demolishing Mariupol theatre ‘to hide war crimes’ – happened
Russia bombs Kharkiv Ukraine claims ‘tactical successes’
Russia-Ukraine war glance: know day 285 invasion
Russia-Ukraine war: know day 151 invasion
‘It’s madness’: Ukraine holds breath Putin turns nuclear plant frontline
print('5 random headlines with negative sentiment polarity: \n')
cl = df.loc[df.polarity < 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with negative sentiment polarity:

Ukrainian adviser quits claims Russian missile killed dozens
Saudi foreign minister defends role securing Ukraine prisoner swaps
Hundreds civilians trapped Soledar amid fierce fighting, Ukraine says
Global carbon emissions forecast cut due Ukraine war Biden, says BP
Why west risks condemning Ukraine slow strangulation
#Check Missing Values
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing
sectionName       0.0%
webTitle          0.0%
webUrl            0.0%
pillarName        0.0%
clean_title       0.0%
clean_title_v2    0.0%
polarity          0.0%
webTitle_len      0.0%
word_count        0.0%
dtype: object
#Checking Number of duplicate Rows
dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
'2.41%'
#Dropping Duplicate Rows
df = df.drop_duplicates()
#Checking Duplicate rows again
dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
'0.0%'
#Finalized df
df.head()
 | sectionName | webTitle | webUrl | pillarName | clean_title | clean_title_v2 | polarity | webTitle_len | word_count |
---|---|---|---|---|---|---|---|---|---|
0 | World news | Finland PM Sanna Marin says Europe is ‘not str... | https://www.theguardian.com/world/2022/dec/02/... | News | Finland PM Sanna Marin says Europe ‘not strong... | Finland PM Sanna Marin says Europe ‘not strong... | -0.108333 | 72 | 13 |
1 | World news | Russia-Ukraine war live: Ukraine ‘expects poss... | https://www.theguardian.com/world/live/2023/fe... | News | Russia-Ukraine war live: Ukraine ‘expects poss... | Russia-Ukraine war live: Ukraine ‘expects poss... | 0.049716 | 103 | 15 |
2 | World news | Russia-Ukraine war live: Ukraine struggling to... | https://www.theguardian.com/world/live/2023/ja... | News | Russia-Ukraine war live: Ukraine struggling ho... | Russia-Ukraine war live: Ukraine struggling ho... | 0.018182 | 98 | 15 |
3 | World news | Russia-Ukraine war live: Ukraine aiming for pe... | https://www.theguardian.com/world/live/2022/de... | News | Russia-Ukraine war live: Ukraine aiming peace ... | Russia-Ukraine war live: Ukraine aiming peace ... | 0.005682 | 91 | 13 |
4 | World news | Russia-Ukraine war: Putin replaces general in ... | https://www.theguardian.com/world/live/2023/ja... | News | Russia-Ukraine war: Putin replaces general cha... | Russia-Ukraine war: Putin replaces general cha... | 0.025000 | 98 | 16 |
#Extracting all the news about war in Australia into a Dataframe
df_aus = df[df.sectionName == 'Australia news']
df_aus.head()
 | sectionName | webTitle | webUrl | pillarName | clean_title | clean_title_v2 | polarity | webTitle_len | word_count |
---|---|---|---|---|---|---|---|---|---|
133 | Australia news | Australian man who died fighting in Ukraine re... | https://www.theguardian.com/australia-news/202... | News | Australian man died fighting Ukraine remembere... | Australian man died fighting Ukraine remembere... | 0.000000 | 79 | 12 |
204 | Australia news | Australia and France agree arms deal for Ukrai... | https://www.theguardian.com/australia-news/202... | News | Australia France agree arms deal Ukraine talks... | Australia France agree arms deal Ukraine talks... | 0.000000 | 84 | 15 |
207 | Australia news | Family mourns death of ‘treasured and loved’ A... | https://www.theguardian.com/australia-news/202... | News | Family mourns death ‘treasured loved’ Australi... | Family mourns death ‘treasured loved’ Australi... | 0.350000 | 70 | 11 |
259 | Australia news | Defence minister hails ‘heroic’ Ukraine counte... | https://www.theguardian.com/australia-news/liv... | News | Defence minister hails ‘heroic’ Ukraine counte... | Defence minister hails ‘heroic’ Ukraine counte... | 0.700000 | 73 | 10 |
273 | Australia news | Morning Mail: ‘Phantom’ carbon credits reveale... | https://www.theguardian.com/australia-news/202... | News | Morning Mail: ‘Phantom’ carbon credits reveale... | Morning Mail: ‘Phantom’ carbon credits reveale... | 0.285714 | 107 | 15 |
#Extracting all the news about war in UK into a Dataframe
df_uk = df[df.sectionName == 'UK news']
df_uk.head()
 | sectionName | webTitle | webUrl | pillarName | clean_title | clean_title_v2 | polarity | webTitle_len | word_count |
---|---|---|---|---|---|---|---|---|---|
65 | UK news | Two missing Britons killed in Ukraine while ev... | https://www.theguardian.com/uk-news/2023/jan/2... | News | Two missing Britons killed Ukraine evacuating ... | Two missing Britons killed Ukraine evacuating ... | -0.20 | 76 | 11 |
94 | UK news | UK seeks more German support as it confirms Ch... | https://www.theguardian.com/uk-news/2023/jan/1... | News | UK seeks German support confirms Challenger ta... | UK seeks German support confirms Challenger ta... | 0.25 | 72 | 12 |
228 | UK news | Liverpool to host Eurovision song contest on b... | https://www.theguardian.com/uk-news/2022/oct/0... | News | Liverpool host Eurovision song contest behalf ... | Liverpool host Eurovision song contest behalf ... | 0.00 | 62 | 10 |
334 | UK news | Migration to UK rises to record 504,000 with U... | https://www.theguardian.com/uk-news/2022/nov/2... | News | Migration UK rises record 504,000 Ukraine Hong... | Migration UK rises record 504,000 Ukraine Hong... | 0.00 | 74 | 13 |
733 | UK news | Two more Britons captured in Ukraine could fac... | https://www.theguardian.com/uk-news/2022/jul/0... | News | Two Britons captured Ukraine could face death ... | Two Britons captured Ukraine could face death ... | 0.50 | 61 | 10 |
df.to_json('Guardian.json')
df_aus.to_json('Guardian_Aus.json')
df_uk.to_json('Guardian_UK.json')
# Build a word cloud from the cleaned Australian headlines
text = ' '.join(df_aus['clean_title_v2'])
wordcloud = WordCloud().generate(text)
figure(figsize=(12, 8), dpi=80)
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Australia News Headlines Word Cloud")
plt.show()
# Build a word cloud from the cleaned UK headlines
text = ' '.join(df_uk['clean_title_v2'])
wordcloud = WordCloud().generate(text)
figure(figsize=(12, 8), dpi=80)
plt.imshow(wordcloud)
plt.axis("off")
plt.title("UK News Headlines Word Cloud")
plt.show()
# Creating histogram
df_aus['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution in Australia')
# Creating histogram
df_uk['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution in UK')
# Creating histogram
df_aus['webTitle_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='title length',
    linecolor='black',
    yTitle='count',
    title='webTitle Length Distribution in Australia')
# Creating histogram
df_uk['webTitle_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='title length',
    linecolor='black',
    yTitle='count',
    title='webTitle Length Distribution in UK')
# Creating histogram
df_aus['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='webTitle Word Count Distribution in Australia')
# Creating histogram
df_uk['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='webTitle Word Count Distribution in UK')
# Creating bar chart
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words(df_aus['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df2.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in webTitle in Australia')
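To make the aggregation explicit, here is what get_top_n_words returns on a small hypothetical corpus: each tuple is (word, total count across all documents), read straight from CountVectorizer's document-term matrix after English stopwords are removed.
# Toy illustration of get_top_n_words (hypothetical corpus, not part of the dataset)
toy_corpus = ['Ukraine war latest', 'Ukraine ceasefire talks', 'war crimes investigation']
print(get_top_n_words(toy_corpus, 3))
# e.g. [('ukraine', 2), ('war', 2), ('ceasefire', 1)] -- the order of equal counts may vary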
# Creating bar chart for the UK, reusing get_top_n_words defined above
common_words = get_top_n_words(df_uk['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df2.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in webTitle in UK')
# Creating bar chart
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df_aus['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df4.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in webTitle in Australia')
# Creating bar chart for the UK, reusing get_top_n_bigram defined above
common_words = get_top_n_bigram(df_uk['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df4.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in webTitle in UK')
# Creating bar chart
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(df_aus['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df6.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in webTitle in Australia')
# Creating bar chart for the UK, reusing get_top_n_trigram defined above
common_words = get_top_n_trigram(df_uk['clean_title_v2'], 20)
#for word, freq in common_words:
#    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['clean_title_v2', 'count'])
df6.groupby('clean_title_v2').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in webTitle in UK')
# Creating box plot of sentiment polarity by section
y0 = df.loc[df['sectionName'] == 'UK news']['polarity']
y1 = df.loc[df['sectionName'] == 'Australia news']['polarity']
y2 = df.loc[df['sectionName'] == 'World news']['polarity']
trace0 = go.Box(
    y=y0,
    name='UK news',
    marker=dict(color='rgb(214, 12, 140)')
)
trace1 = go.Box(
    y=y1,
    name='Australia news',
    marker=dict(color='rgb(0, 128, 128)')
)
trace2 = go.Box(
    y=y2,
    name='World news',
    marker=dict(color='rgb(10, 140, 208)')
)
data = [trace0, trace1, trace2]
layout = go.Layout(
    title="Sentiment Polarity of News in Australia, the UK and Around the World"
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sentiment Polarity of News in Australia, the UK and Around the World")
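As a numeric companion to the box plot, the per-section summary statistics can be printed directly; a quick sketch using pandas' describe:
# Numeric summary behind the box plot: count, mean, median and spread of polarity per section
summary = (df[df['sectionName'].isin(['UK news', 'Australia news', 'World news'])]
           .groupby('sectionName')['polarity']
           .describe()[['count', 'mean', '50%', 'std', 'min', 'max']])
print(summary)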
The Russo-Ukrainian War is an ongoing war between Russia (together with pro-Russian separatist forces) and Ukraine. It began in February 2014 following the Ukrainian Revolution of Dignity, and initially focused on the status of Crimea and parts of the Donbas, internationally recognized as part of Ukraine. The first eight years of the conflict included the Russian annexation of Crimea (2014) and the war in Donbas (2014–present) between Ukraine and Russian-backed separatists, as well as naval incidents, cyberwarfare, and political tensions. Following a Russian military build-up on the Russia–Ukraine border from late 2021, the conflict expanded significantly when Russia launched a full-scale invasion of Ukraine on 24 February 2022.
News of the escalation of the Russo-Ukrainian war spread like wildfire across the world, and different parts of this global village have reacted to it in contrasting ways. In this project we compare the sentiment of the news being broadcast in Australia and the UK. We first extracted the dataset from the Guardian API, ran sentiment analysis on all the news headlines and classified them as positive, neutral or negative. We also split the news dataframe into Australian and UK news specific to the invasion of Ukraine.
We first looked at the keywords used in the headlines of both Australian and UK news, which showed which terms each country's coverage focused on. Based on the word clouds, Australian news appeared more neutral, largely mirroring the generic headlines broadcast throughout the world, while UK news seemed more concerned with the potential refugee crisis created by the situation and how it is affecting the country.
We then looked at word frequencies, both for individual words and for word pairs, in Australian and UK news. The charts show that in Australia the term Ukraine was often accompanied by Russia, war, ceasefire and sanction, suggesting a generic, neutral tone in the coverage, whereas in the UK the term Ukraine appears alongside refugee crisis, help Ukraine and imposing sanctions, reflecting a more negative sentiment towards Russia's invasion of Ukraine. This is possibly because of refugees arriving in the UK given its relative proximity and accessibility.
Finally, we compared the sentiment of Australian news with that of the UK and of world news. Sentiment in UK news was rather negative, reflecting concern about a looming refugee crisis and its impact on the economy, whereas Australian news leaned slightly positive, with the word frequencies pointing to hopes of a ceasefire. World news, on the other hand, was neutral to divided, suggesting a world uncertain about a crisis that will affect the global economy and hoping for a ceasefire to avoid further bloodshed.
Based on this analysis we can clearly see how sentiment varies between the two countries, and how their coverage differs with their geographic location and the impact they expect the war to have.